c-library: libutf8proc.a libutf8proc.$(SHLIB_EXT)
clean:
- rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_EXT) normtest graphemetest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt
+# Remove everything this Makefile builds or downloads, including the
+# printproperty test program and the GraphemeBreakProperty.txt download.
+	rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest test/printproperty data/UnicodeData.txt data/GraphemeBreakProperty.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt data/NormalizationTest.txt data/GraphemeBreakTest.txt
$(MAKE) -C bench clean
update: utf8proc_data.c.new
# real targets
-utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt
- $(RUBY) data_generator.rb < UnicodeData.txt > utf8proc_data.c.new
+# Regenerate the property tables. data_generator.rb opens its auxiliary
+# inputs by bare file name, so it has to run from inside data/;
+# UnicodeData.txt is fed on stdin and the C source is captured from stdout.
+# "&&" keeps the generator from running if the cd fails.
+utf8proc_data.c.new: data/data_generator.rb data/UnicodeData.txt data/GraphemeBreakProperty.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt
+	(cd data && $(RUBY) data_generator.rb < UnicodeData.txt) > $@
-UnicodeData.txt:
- $(CURL) -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
+# Download straight to the target path; -o alone names the output file.
+data/UnicodeData.txt:
+	$(CURL) -o $@ http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
-GraphemeBreakProperty.txt:
- $(CURL) -O http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
+# Download straight to the target path; -o alone names the output file.
+data/GraphemeBreakProperty.txt:
+	$(CURL) -o $@ http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
-DerivedCoreProperties.txt:
- $(CURL) -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
+# Download straight to the target path; -o alone names the output file.
+data/DerivedCoreProperties.txt:
+	$(CURL) -o $@ http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
-CompositionExclusions.txt:
- $(CURL) -O http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
+# Download straight to the target path; -o alone names the output file.
+data/CompositionExclusions.txt:
+	$(CURL) -o $@ http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
-CaseFolding.txt:
- $(CURL) -O http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
+# Download straight to the target path; -o alone names the output file.
+data/CaseFolding.txt:
+	$(CURL) -o $@ http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c
$(cc) -c -o utf8proc.o utf8proc.c
# Test programs
-NormalizationTest.txt:
- $(CURL) -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
+# Download straight to the target path; -o alone names the output file.
+data/NormalizationTest.txt:
+	$(CURL) -o $@ http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
-GraphemeBreakTest.txt:
+data/GraphemeBreakTest.txt:
$(CURL) http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
-normtest: normtest.c utf8proc.o utf8proc.h tests.h
- $(cc) normtest.c utf8proc.o -o $@
+# Normalization test driver; $< is its C source, the first prerequisite.
+test/normtest: test/normtest.c utf8proc.o utf8proc.h test/tests.h
+	$(cc) $< utf8proc.o -o $@
-graphemetest: graphemetest.c utf8proc.o utf8proc.h tests.h
- $(cc) graphemetest.c utf8proc.o -o $@
+# Grapheme-break test driver; $< is its C source, the first prerequisite.
+test/graphemetest: test/graphemetest.c utf8proc.o utf8proc.h test/tests.h
+	$(cc) $< utf8proc.o -o $@
-printproperty: printproperty.c utf8proc.o utf8proc.h tests.h
- $(cc) printproperty.c utf8proc.o -o $@
+# Property dump utility; $< is its C source, the first prerequisite.
+test/printproperty: test/printproperty.c utf8proc.o utf8proc.h test/tests.h
+	$(cc) $< utf8proc.o -o $@
-check: normtest NormalizationTest.txt graphemetest GraphemeBreakTest.txt
- ./normtest
- ./graphemetest
+# Run both test programs, each on the data file passed as its only argument.
+check: test/normtest test/graphemetest data/NormalizationTest.txt data/GraphemeBreakTest.txt
+	test/normtest data/NormalizationTest.txt
+	test/graphemetest data/GraphemeBreakTest.txt
--- /dev/null
+#!/usr/bin/env ruby
+
+# This file was used to generate the 'utf8proc_data.c' file by parsing the
+# Unicode data file 'UnicodeData.txt' of the Unicode Character Database.
+# It is included for informational purposes only and not intended for
+# production use.
+
+
+# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+# This file contains derived data from a modified version of the
+# Unicode data files. The following license applies to that data:
+#
+# COPYRIGHT AND PERMISSION NOTICE
+#
+# Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
+# under the Terms of Use in http://www.unicode.org/copyright.html.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of the Unicode data files and any associated documentation (the "Data
+# Files") or Unicode software and any associated documentation (the
+# "Software") to deal in the Data Files or Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, and/or sell copies of the Data Files or Software, and
+# to permit persons to whom the Data Files or Software are furnished to do
+# so, provided that (a) the above copyright notice(s) and this permission
+# notice appear with all copies of the Data Files or Software, (b) both the
+# above copyright notice(s) and this permission notice appear in associated
+# documentation, and (c) there is clear notice in each modified Data File or
+# in the Software as well as in the documentation associated with the Data
+# File(s) or Software that the data or software has been modified.
+#
+# THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+# THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+# INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
+# CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+# USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+# PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+#
+# Except as contained in this notice, the name of a copyright holder shall
+# not be used in advertising or otherwise to promote the sale, use or other
+# dealings in these Data Files or Software without prior written
+# authorization of the copyright holder.
+
+
+# Collect every code point listed in the Default_Ignorable_Code_Point section
+# of DerivedCoreProperties.txt; "XXXX..YYYY" lines are expanded to the full
+# inclusive range.
+$ignorable_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m]
+$ignorable = []
+$ignorable_list.each_line do |entry|
+  case entry
+  when /^([0-9A-F]+)\.\.([0-9A-F]+)/
+    $ignorable.concat(($1.hex..$2.hex).to_a)
+  when /^[0-9A-F]+/
+    $ignorable << $&.hex
+  end
+end
+
+# Map code point -> C boundclass constant from GraphemeBreakProperty.txt.
+# Code points the file does not mention fall back to the hash default,
+# UTF8PROC_BOUNDCLASS_OTHER.
+$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
+$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
+$grapheme_boundclass_list.each_line do |entry|
+  case entry
+  when /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
+    boundclass = "UTF8PROC_BOUNDCLASS_" + $3.upcase
+    ($1.hex..$2.hex).each { |cp| $grapheme_boundclass[cp] = boundclass }
+  when /^([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
+    $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_" + $2.upcase
+  end
+end
+
+# Composition exclusions from sections (1) and (2) of
+# CompositionExclusions.txt. Only lines that start with a code point are data.
+# Comment and blank lines are skipped, because String#hex would turn each of
+# them into 0 and thereby list U+0000 as excluded.
+exclusions_text = File.read("CompositionExclusions.txt")
+
+$exclusions = exclusions_text[/# \(1\) Script Specifics.*?# Total code points:/m]
+$exclusions = $exclusions.scan(/^[0-9A-F]+/).collect { |e| e.hex }
+
+$excl_version = exclusions_text[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m]
+$excl_version = $excl_version.scan(/^[0-9A-F]+/).collect { |e| e.hex }
+
+# Map each code point to its case-folded sequence, taken from the C, F and S
+# rows of CaseFolding.txt. File.read is used (as for the other inputs) so the
+# file handle is closed again.
+# NOTE(review): a later row for the same code point overwrites an earlier
+# one, so which of an F/S pair wins depends on row order in the data file --
+# confirm that this is intended.
+$case_folding_string = File.read("CaseFolding.txt")
+
+$case_folding = {}
+$case_folding_string.chomp.split("\n").each do |line|
+  next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
+  $case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex }
+end
+
+# Flat pool of -1-terminated integer sequences, and the offset at which each
+# distinct sequence starts inside that pool.
+$int_array = []
+$int_array_indicies = {}
+
+# Returns the C enum constant UTF8PROC_<prefix>_<STRING>, or "0" for nil.
+def str2c(string, prefix)
+  string.nil? ? "0" : "UTF8PROC_#{prefix}_#{string.upcase}"
+end
+
+# Returns a C expression pointing at +array+ inside utf8proc_sequences
+# ("NULL" for nil). The first time a given sequence is seen it is appended to
+# the pool, followed by a -1 terminator.
+def ary2c(array)
+  return "NULL" if array.nil?
+  offset = $int_array_indicies[array]
+  if offset.nil?
+    offset = $int_array_indicies[array] = $int_array.length
+    $int_array.concat(array)
+    $int_array << -1
+  end
+  "utf8proc_sequences + #{offset}"
+end
+
+# One parsed record of UnicodeData.txt, plus the formatter that renders it as
+# a C initializer for the generated property table.
+class UnicodeChar
+  attr_accessor :code, :name, :category, :combining_class, :bidi_class,
+                :decomp_type, :decomp_mapping,
+                :bidi_mirrored,
+                :uppercase_mapping, :lowercase_mapping, :titlecase_mapping
+
+  # Parses one semicolon-separated record. Raises RuntimeError when the line
+  # does not have the expected field layout.
+  def initialize(line)
+    raise "Could not parse input." unless line =~ /^
+      ([0-9A-F]+);        # code
+      ([^;]+);            # name
+      ([A-Z]+);           # general category
+      ([0-9]+);           # canonical combining class
+      ([A-Z]+);           # bidi class
+      (<([A-Z]*)>)?       # decomposition type
+      ((\ ?[0-9A-F]+)*);  # decomposition mapping
+      ([0-9]*);           # decimal digit
+      ([0-9]*);           # digit
+      ([^;]*);            # numeric
+      ([YN]*);            # bidi mirrored
+      ([^;]*);            # unicode 1.0 name
+      ([^;]*);            # iso comment
+      ([0-9A-F]*);        # simple uppercase mapping
+      ([0-9A-F]*);        # simple lowercase mapping
+      ([0-9A-F]*)$/ix     # simple titlecase mapping
+    @code = $1.hex
+    @name = $2
+    @category = $3
+    @combining_class = Integer($4)
+    @bidi_class = $5
+    @decomp_type = $7
+    @decomp_mapping = ($8=='') ? nil :
+      $8.split.collect { |element| element.hex }
+    @bidi_mirrored = ($13=='Y') ? true : false
+    @uppercase_mapping = ($16=='') ? nil : $16.hex
+    @lowercase_mapping = ($17=='') ? nil : $17.hex
+    @titlecase_mapping = ($18=='') ? nil : $18.hex
+  end
+
+  # Case-folded code point sequence for this character, or nil if the global
+  # case-folding table has no entry for it.
+  def case_folding
+    $case_folding[code]
+  end
+
+  # Renders this character as one line of the C property table.
+  # comb1_indicies / comb2_indicies map code points to their first/second
+  # position in the composition table. ary2c appends to the global sequence
+  # pool as a side effect, so the decomposition mapping is rendered before
+  # the case folding, exactly as in the field order below.
+  def c_entry(comb1_indicies, comb2_indicies)
+    comb1st_index = comb1_indicies[code] ?
+      (comb1_indicies[code]*comb2_indicies.keys.length) : -1
+    # U+200C and U+200D have category Cf but are exempt from the control
+    # boundary flag. The exemption list holds code points, so it has to be
+    # tested against `code`; testing it against the category string could
+    # never match.
+    control_boundary = (%W[Zl Zp Cc Cf].include?(category) and
+                        not [0x200C, 0x200D].include?(code))
+    " " <<
+    "{#{str2c category, 'CATEGORY'}, #{combining_class}, " <<
+    "#{str2c bidi_class, 'BIDI_CLASS'}, " <<
+    "#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
+    "#{ary2c decomp_mapping}, " <<
+    "#{ary2c case_folding}, " <<
+    "#{uppercase_mapping or -1}, " <<
+    "#{lowercase_mapping or -1}, " <<
+    "#{titlecase_mapping or -1}, " <<
+    "#{comb1st_index}, #{comb2_indicies[code] or -1}, " <<
+    "#{bidi_mirrored}, " <<
+    "#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
+    "#{$ignorable.include?(code)}, " <<
+    "#{control_boundary}, " <<
+    "#{$grapheme_boundclass[code]}},\n"
+  end
+end
+
+# Read UnicodeData.txt records from standard input. A "<..., First>" /
+# "<..., Last>" pair of lines stands for a whole range: the properties of the
+# Last line are copied to every code point in between.
+chars = []
+char_hash = {}
+
+while (line = gets)
+  if line =~ /^([0-9A-F]+);<[^;>,]+, First>;/i
+    first = $1.hex
+    last_line = gets
+    char = UnicodeChar.new(last_line)
+    raise "No last character of sequence found." unless
+      last_line =~ /^([0-9A-F]+);<([^;>,]+), Last>;/i
+    last = $1.hex
+    name = "<#{$2}>"
+    (first..last).each do |cp|
+      char_clone = char.clone
+      char_clone.code = cp
+      char_clone.name = name
+      char_hash[cp] = char_clone
+      chars << char_clone
+    end
+  else
+    char = UnicodeChar.new(line)
+    char_hash[char.code] = char
+    chars << char
+  end
+end
+
+# Build the composition table. Every character that has a two-element
+# mapping without a decomposition type, whose first element is a known
+# character of combining class 0, and that is not in $exclusions gets a
+# cell at [index of first element][index of second element].
+comb1st_indicies = {}
+comb2nd_indicies = {}
+comb_array = []
+
+chars.each do |char|
+  next if char.nil? or !char.decomp_type.nil?
+  mapping = char.decomp_mapping
+  next unless mapping and mapping.length == 2
+  first_cp, second_cp = mapping
+  first_char = char_hash[first_cp]
+  next if first_char.nil? or first_char.combining_class != 0
+  next if $exclusions.include?(char.code)
+
+  # indices are handed out in order of first appearance
+  comb1st_indicies[first_cp] ||= comb1st_indicies.keys.length
+  comb2nd_indicies[second_cp] ||= comb2nd_indicies.keys.length
+
+  row = (comb_array[comb1st_indicies[first_cp]] ||= [])
+  column = comb2nd_indicies[second_cp]
+  raise "Duplicate canonical mapping" if row[column]
+  row[column] = char.code
+end
+
+# Deduplicate the rendered property entries: `properties` keeps each distinct
+# C initializer once, `properties_indicies` maps the text to its position.
+properties_indicies = {}
+properties = []
+chars.each do |char|
+  rendered = char.c_entry(comb1st_indicies, comb2nd_indicies)
+  next if properties_indicies[rendered]
+  properties_indicies[rendered] = properties.length
+  properties << rendered
+end
+
+# Build the two-stage lookup. Each run of 0x100 code points becomes one
+# stage-2 block of (property index + 1), with 0 for unassigned code points.
+# Identical blocks are stored once; stage 1 holds each block's start offset.
+stage1 = []
+stage2 = []
+0.step(0x10FFFF, 0x100) do |block_start|
+  block = (block_start...(block_start + 0x100)).collect do |cp|
+    uc = char_hash[cp]
+    if uc
+      properties_indicies[uc.c_entry(comb1st_indicies, comb2nd_indicies)] + 1
+    else
+      0
+    end
+  end
+  seen_at = stage2.index(block)
+  if seen_at
+    stage1 << (seen_at * 0x100)
+  else
+    stage1 << (stage2.length * 0x100)
+    stage2 << block
+  end
+end
+
+# Write the generated C tables to standard output.
+
+# Prints each entry followed by ", ", starting a new line before every
+# eighth entry (the 8th, 16th, ...).
+emit_wrapped = lambda do |entries|
+  position = 0
+  entries.each do |entry|
+    position += 1
+    $stdout << "\n " if position % 8 == 0
+    $stdout << entry << ", "
+  end
+end
+
+$stdout << "const int32_t utf8proc_sequences[] = {\n "
+emit_wrapped.call($int_array)
+$stdout << "};\n\n"
+
+$stdout << "const uint16_t utf8proc_stage1table[] = {\n "
+emit_wrapped.call(stage1)
+$stdout << "};\n\n"
+
+$stdout << "const uint16_t utf8proc_stage2table[] = {\n "
+emit_wrapped.call(stage2.flatten)
+$stdout << "};\n\n"
+
+# The leading all-default entry is the one that stage-2 value 0 refers to.
+$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
+$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
+properties.each { |line| $stdout << line }
+$stdout << "};\n\n"
+
+# Composition table flattened row by row; empty cells become -1.
+combinations = []
+comb1st_indicies.keys.each_index do |a|
+  comb2nd_indicies.keys.each_index do |b|
+    combinations << ( comb_array[a][b] or -1 )
+  end
+end
+$stdout << "const int32_t utf8proc_combinations[] = {\n "
+emit_wrapped.call(combinations)
+$stdout << "};\n\n"
+
+++ /dev/null
-#!/usr/bin/env ruby
-
-# This file was used to generate the 'unicode_data.c' file by parsing the
-# Unicode data file 'UnicodeData.txt' of the Unicode Character Database.
-# It is included for informational purposes only and not intended for
-# production use.
-
-
-# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-# This file contains derived data from a modified version of the
-# Unicode data files. The following license applies to that data:
-#
-# COPYRIGHT AND PERMISSION NOTICE
-#
-# Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
-# under the Terms of Use in http://www.unicode.org/copyright.html.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of the Unicode data files and any associated documentation (the "Data
-# Files") or Unicode software and any associated documentation (the
-# "Software") to deal in the Data Files or Software without restriction,
-# including without limitation the rights to use, copy, modify, merge,
-# publish, distribute, and/or sell copies of the Data Files or Software, and
-# to permit persons to whom the Data Files or Software are furnished to do
-# so, provided that (a) the above copyright notice(s) and this permission
-# notice appear with all copies of the Data Files or Software, (b) both the
-# above copyright notice(s) and this permission notice appear in associated
-# documentation, and (c) there is clear notice in each modified Data File or
-# in the Software as well as in the documentation associated with the Data
-# File(s) or Software that the data or software has been modified.
-#
-# THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
-# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
-# THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
-# INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
-# CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
-# USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-# PERFORMANCE OF THE DATA FILES OR SOFTWARE.
-#
-# Except as contained in this notice, the name of a copyright holder shall
-# not be used in advertising or otherwise to promote the sale, use or other
-# dealings in these Data Files or Software without prior written
-# authorization of the copyright holder.
-
-
-$ignorable_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m]
-$ignorable = []
-$ignorable_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $ignorable << e2 }
- elsif entry =~ /^[0-9A-F]+/
- $ignorable << $&.hex
- end
-end
-
-$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
-$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
-$grapheme_boundclass_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
- $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_" + $3.upcase }
- elsif entry =~ /^([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
- $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_" + $2.upcase
- end
-end
-
-$exclusions = File.read("CompositionExclusions.txt")[/# \(1\) Script Specifics.*?# Total code points:/m]
-$exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex }
-
-$excl_version = File.read("CompositionExclusions.txt")[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m]
-$excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }
-
-$case_folding_string = File.open("CaseFolding.txt").read
-
-$case_folding = {}
-$case_folding_string.chomp.split("\n").each do |line|
- next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
- $case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex }
-end
-
-$int_array = []
-$int_array_indicies = {}
-
-def str2c(string, prefix)
- return "0" if string.nil?
- return "UTF8PROC_#{prefix}_#{string.upcase}"
-end
-def ary2c(array)
- return "NULL" if array.nil?
- unless $int_array_indicies[array]
- $int_array_indicies[array] = $int_array.length
- array.each { |entry| $int_array << entry }
- $int_array << -1
- end
- return "utf8proc_sequences + #{$int_array_indicies[array]}"
-end
-
-class UnicodeChar
- attr_accessor :code, :name, :category, :combining_class, :bidi_class,
- :decomp_type, :decomp_mapping,
- :bidi_mirrored,
- :uppercase_mapping, :lowercase_mapping, :titlecase_mapping
- def initialize(line)
- raise "Could not parse input." unless line =~ /^
- ([0-9A-F]+); # code
- ([^;]+); # name
- ([A-Z]+); # general category
- ([0-9]+); # canonical combining class
- ([A-Z]+); # bidi class
- (<([A-Z]*)>)? # decomposition type
- ((\ ?[0-9A-F]+)*); # decompomposition mapping
- ([0-9]*); # decimal digit
- ([0-9]*); # digit
- ([^;]*); # numeric
- ([YN]*); # bidi mirrored
- ([^;]*); # unicode 1.0 name
- ([^;]*); # iso comment
- ([0-9A-F]*); # simple uppercase mapping
- ([0-9A-F]*); # simple lowercase mapping
- ([0-9A-F]*)$/ix # simple titlecase mapping
- @code = $1.hex
- @name = $2
- @category = $3
- @combining_class = Integer($4)
- @bidi_class = $5
- @decomp_type = $7
- @decomp_mapping = ($8=='') ? nil :
- $8.split.collect { |element| element.hex }
- @bidi_mirrored = ($13=='Y') ? true : false
- @uppercase_mapping = ($16=='') ? nil : $16.hex
- @lowercase_mapping = ($17=='') ? nil : $17.hex
- @titlecase_mapping = ($18=='') ? nil : $18.hex
- end
- def case_folding
- $case_folding[code]
- end
- def c_entry(comb1_indicies, comb2_indicies)
- " " <<
- "{#{str2c category, 'CATEGORY'}, #{combining_class}, " <<
- "#{str2c bidi_class, 'BIDI_CLASS'}, " <<
- "#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
- "#{ary2c decomp_mapping}, " <<
- "#{ary2c case_folding}, " <<
- "#{uppercase_mapping or -1}, " <<
- "#{lowercase_mapping or -1}, " <<
- "#{titlecase_mapping or -1}, " <<
- "#{comb1_indicies[code] ?
- (comb1_indicies[code]*comb2_indicies.keys.length) : -1
- }, #{comb2_indicies[code] or -1}, " <<
- "#{bidi_mirrored}, " <<
- "#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
- "#{$ignorable.include?(code)}, " <<
- "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
- "#{$grapheme_boundclass[code]}},\n"
- end
-end
-
-chars = []
-char_hash = {}
-
-while gets
- if $_ =~ /^([0-9A-F]+);<[^;>,]+, First>;/i
- first = $1.hex
- gets
- char = UnicodeChar.new($_)
- raise "No last character of sequence found." unless
- $_ =~ /^([0-9A-F]+);<([^;>,]+), Last>;/i
- last = $1.hex
- name = "<#{$2}>"
- for i in first..last
- char_clone = char.clone
- char_clone.code = i
- char_clone.name = name
- char_hash[char_clone.code] = char_clone
- chars << char_clone
- end
- else
- char = UnicodeChar.new($_)
- char_hash[char.code] = char
- chars << char
- end
-end
-
-comb1st_indicies = {}
-comb2nd_indicies = {}
-comb_array = []
-
-chars.each do |char|
- if !char.nil? and char.decomp_type.nil? and char.decomp_mapping and
- char.decomp_mapping.length == 2 and !char_hash[char.decomp_mapping[0]].nil? and
- char_hash[char.decomp_mapping[0]].combining_class == 0 and
- not $exclusions.include?(char.code)
- unless comb1st_indicies[char.decomp_mapping[0]]
- comb1st_indicies[char.decomp_mapping[0]] = comb1st_indicies.keys.length
- end
- unless comb2nd_indicies[char.decomp_mapping[1]]
- comb2nd_indicies[char.decomp_mapping[1]] = comb2nd_indicies.keys.length
- end
- comb_array[comb1st_indicies[char.decomp_mapping[0]]] ||= []
- raise "Duplicate canonical mapping" if
- comb_array[comb1st_indicies[char.decomp_mapping[0]]][
- comb2nd_indicies[char.decomp_mapping[1]]]
- comb_array[comb1st_indicies[char.decomp_mapping[0]]][
- comb2nd_indicies[char.decomp_mapping[1]]] = char.code
- end
-end
-
-properties_indicies = {}
-properties = []
-chars.each do |char|
- c_entry = char.c_entry(comb1st_indicies, comb2nd_indicies)
- unless properties_indicies[c_entry]
- properties_indicies[c_entry] = properties.length
- properties << c_entry
- end
-end
-
-stage1 = []
-stage2 = []
-for code in 0...0x110000
- next unless code % 0x100 == 0
- stage2_entry = []
- for code2 in code...(code+0x100)
- if char_hash[code2]
- stage2_entry << (properties_indicies[char_hash[code2].c_entry(
- comb1st_indicies, comb2nd_indicies)] + 1)
- else
- stage2_entry << 0
- end
- end
- old_index = stage2.index(stage2_entry)
- if old_index
- stage1 << (old_index * 0x100)
- else
- stage1 << (stage2.length * 0x100)
- stage2 << stage2_entry
- end
-end
-
-$stdout << "const int32_t utf8proc_sequences[] = {\n "
-i = 0
-$int_array.each do |entry|
- i += 1
- if i == 8
- i = 0
- $stdout << "\n "
- end
- $stdout << entry << ", "
-end
-$stdout << "};\n\n"
-
-$stdout << "const uint16_t utf8proc_stage1table[] = {\n "
-i = 0
-stage1.each do |entry|
- i += 1
- if i == 8
- i = 0
- $stdout << "\n "
- end
- $stdout << entry << ", "
-end
-$stdout << "};\n\n"
-
-$stdout << "const uint16_t utf8proc_stage2table[] = {\n "
-i = 0
-stage2.flatten.each do |entry|
- i += 1
- if i == 8
- i = 0
- $stdout << "\n "
- end
- $stdout << entry << ", "
-end
-$stdout << "};\n\n"
-
-$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
-$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
-properties.each { |line|
- $stdout << line
-}
-$stdout << "};\n\n"
-
-$stdout << "const int32_t utf8proc_combinations[] = {\n "
-i = 0
-comb1st_indicies.keys.each_index do |a|
- comb2nd_indicies.keys.each_index do |b|
- i += 1
- if i == 8
- i = 0
- $stdout << "\n "
- end
- $stdout << ( comb_array[a][b] or -1 ) << ", "
- end
-end
-$stdout << "};\n\n"
-
+++ /dev/null
-#include "tests.h"
-
-int main(void)
-{
- char *buf = NULL;
- size_t bufsize = 0;
- FILE *f = fopen("GraphemeBreakTest.txt", "r");
- uint8_t src[1024];
-
- check(f != NULL, "error opening GraphemeBreakTest.txt");
- while (getline(&buf, &bufsize, f) > 0) {
- size_t bi = 0, si = 0;
- lineno += 1;
-
- if (lineno % 100 == 0)
- printf("checking line %zd...\n", lineno);
-
- if (buf[0] == '#') continue;
-
- while (buf[bi]) {
- bi = skipspaces(buf, bi);
- if (buf[bi] == '/') { /* grapheme break */
- src[si++] = '/';
- bi++;
- }
- else if (buf[bi] == '+') { /* no break */
- bi++;
- }
- else if (buf[bi] == '#') { /* start of comments */
- break;
- }
- else { /* hex-encoded codepoint */
- bi += encode((char*) (src + si), buf + bi) - 1;
- while (src[si]) ++si; /* advance to NUL termination */
- }
- }
- if (si && src[si-1] == '/')
- --si; /* no break after final grapheme */
- src[si] = 0; /* NUL-terminate */
-
- if (si) {
- uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
- size_t i = 0, j = 0;
- ssize_t glen;
- uint8_t *g; /* utf8proc_map grapheme results */
- while (i < si) {
- if (src[i] != '/')
- utf8[j++] = src[i++];
- else
- i++;
- }
- glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
- if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
- /* the test file contains surrogate codepoints, which are only for UTF-16 */
- printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
- }
- else {
- check(glen >= 0, "utf8proc_map error = %s",
- utf8proc_errmsg(glen));
- for (i = 0; i <= glen; ++i)
- if (g[i] == 0xff)
- g[i] = '/'; /* easier-to-read output (/ is not in test strings) */
- check(!strcmp((char*)g, (char*)src),
- "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
- }
- free(g);
- }
- }
- fclose(f);
- printf("Passed tests after %zd lines!\n", lineno);
- return 0;
-}
+++ /dev/null
-#include "tests.h"
-
-#define CHECK_NORM(NRM, norm, src) { \
- char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \
- check(!strcmp(norm, src_norm), \
- "normalization failed for %s -> %s", src, norm); \
- free(src_norm); \
-}
-
-int main(void)
-{
- char *buf = NULL;
- size_t bufsize = 0;
- FILE *f = fopen("NormalizationTest.txt", "r");
- char source[1024], NFC[1024], NFD[1024], NFKC[1024], NFKD[1024];
-
- check(f != NULL, "error opening NormalizationTest.txt");
- while (getline(&buf, &bufsize, f) > 0) {
- size_t offset;
- lineno += 1;
-
- if (buf[0] == '@') {
- printf("line %zd: %s", lineno, buf + 1);
- continue;
- }
- else if (lineno % 1000 == 0)
- printf("checking line %zd...\n", lineno);
-
- if (buf[0] == '#') continue;
-
- offset = encode(source, buf);
- offset += encode(NFC, buf + offset);
- offset += encode(NFD, buf + offset);
- offset += encode(NFKC, buf + offset);
- offset += encode(NFKD, buf + offset);
-
- CHECK_NORM(NFC, NFC, source);
- CHECK_NORM(NFC, NFC, NFC);
- CHECK_NORM(NFC, NFC, NFD);
- CHECK_NORM(NFC, NFKC, NFKC);
- CHECK_NORM(NFC, NFKC, NFKD);
-
- CHECK_NORM(NFD, NFD, source);
- CHECK_NORM(NFD, NFD, NFC);
- CHECK_NORM(NFD, NFD, NFD);
- CHECK_NORM(NFD, NFKD, NFKC);
- CHECK_NORM(NFD, NFKD, NFKD);
-
- CHECK_NORM(NFKC, NFKC, source);
- CHECK_NORM(NFKC, NFKC, NFC);
- CHECK_NORM(NFKC, NFKC, NFD);
- CHECK_NORM(NFKC, NFKC, NFKC);
- CHECK_NORM(NFKC, NFKC, NFKD);
-
- CHECK_NORM(NFKD, NFKD, source);
- CHECK_NORM(NFKD, NFKD, NFC);
- CHECK_NORM(NFKD, NFKD, NFD);
- CHECK_NORM(NFKD, NFKD, NFKC);
- CHECK_NORM(NFKD, NFKD, NFKD);
- }
- fclose(f);
- printf("Passed tests after %zd lines!\n", lineno);
- return 0;
-}
+++ /dev/null
-/* simple test program to print out the utf8proc properties for a codepoint */
-
-#include "tests.h"
-
-int main(int argc, char **argv)
-{
- int i;
-
- for (i = 1; i < argc; ++i) {
- int c;
- check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
- const utf8proc_property_t *p = utf8proc_get_property(c);
- printf("U+%s:\n"
- " category = %d\n"
- " combining_class = %d\n"
- " bidi_class = %d\n"
- " decomp_type = %d\n"
- " uppercase_mapping = %x\n"
- " lowercase_mapping = %x\n"
- " titlecase_mapping = %x\n"
- " comb1st_index = %d\n"
- " comb2nd_index = %d\n"
- " bidi_mirrored = %d\n"
- " comp_exclusion = %d\n"
- " ignorable = %d\n"
- " control_boundary = %d\n"
- " boundclass = %d\n",
- argv[i],
- p->category,
- p->combining_class,
- p->bidi_class,
- p->decomp_type,
- p->uppercase_mapping,
- p->lowercase_mapping,
- p->titlecase_mapping,
- p->comb1st_index,
- p->comb2nd_index,
- p->bidi_mirrored,
- p->comp_exclusion,
- p->ignorable,
- p->control_boundary,
- p->boundclass);
- }
- return 0;
-}
--- /dev/null
+#include "tests.h"
+
+/* Checks utf8proc's grapheme segmentation against every line of the test
+ * file named by argv[1]. In that file '/' marks a grapheme break and '+'
+ * marks "no break" between hex-encoded code points; '#' starts a comment. */
+int main(int argc, char **argv)
+{
+    char *buf = NULL;
+    size_t bufsize = 0;
+    FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
+    uint8_t src[1024];
+
+    /* report the file that was actually requested, or that none was given */
+    check(f != NULL, "error opening %s",
+          argc > 1 ? argv[1] : "(no test file given on the command line)");
+    while (getline(&buf, &bufsize, f) > 0) {
+        size_t bi = 0, si = 0;
+        lineno += 1;
+
+        if (lineno % 100 == 0)
+            printf("checking line %zd...\n", lineno);
+
+        if (buf[0] == '#') continue;
+
+        /* build the expected result in src: UTF-8 with '/' between graphemes */
+        while (buf[bi]) {
+            bi = skipspaces(buf, bi);
+            if (buf[bi] == '/') { /* grapheme break */
+                src[si++] = '/';
+                bi++;
+            }
+            else if (buf[bi] == '+') { /* no break */
+                bi++;
+            }
+            else if (buf[bi] == '#') { /* start of comments */
+                break;
+            }
+            else { /* hex-encoded codepoint */
+                bi += encode((char*) (src + si), buf + bi) - 1;
+                while (src[si]) ++si; /* advance to NUL termination */
+            }
+        }
+        if (si && src[si-1] == '/')
+            --si; /* no break after final grapheme */
+        src[si] = 0; /* NUL-terminate */
+
+        if (si) {
+            uint8_t utf8[1024]; /* copy of src without the '/' separators */
+            size_t i = 0, j = 0;
+            ssize_t glen;
+            uint8_t *g; /* utf8proc_map grapheme results */
+            while (i < si) {
+                if (src[i] != '/')
+                    utf8[j++] = src[i++];
+                else
+                    i++;
+            }
+            glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
+            if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
+                /* the test file contains surrogate codepoints, which are only for UTF-16 */
+                printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
+            }
+            else {
+                check(glen >= 0, "utf8proc_map error = %s",
+                      utf8proc_errmsg(glen));
+                for (i = 0; i <= glen; ++i)
+                    if (g[i] == 0xff)
+                        g[i] = '/'; /* easier-to-read output (/ is not in test strings) */
+                check(!strcmp((char*)g, (char*)src),
+                      "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
+            }
+            free(g);
+        }
+    }
+    free(buf); /* getline() allocated the line buffer; the caller releases it */
+    fclose(f);
+    printf("Passed tests after %zd lines!\n", lineno);
+    return 0;
+}
--- /dev/null
+#include "tests.h"
+
+/* Normalizes src with utf8proc_<NRM>() and checks that the result equals
+ * norm. The body is wrapped in do { } while (0) so that an invocation
+ * followed by ';' is exactly one statement, and the arguments are
+ * parenthesized so any expression can be passed. */
+#define CHECK_NORM(NRM, norm, src) do {                                \
+    char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) (src));       \
+    check(!strcmp((norm), src_norm),                                   \
+          "normalization failed for %s -> %s", (src), (norm));         \
+    free(src_norm);                                                    \
+} while (0)
+
+/* Runs the normalization conformance checks on every data line of the test
+ * file named by argv[1]. Each data line is decoded into five strings
+ * (source, NFC, NFD, NFKC, NFKD); lines starting with '@' are echoed and
+ * lines starting with '#' are skipped. */
+int main(int argc, char **argv)
+{
+    char *buf = NULL;
+    size_t bufsize = 0;
+    FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
+    char source[1024], NFC[1024], NFD[1024], NFKC[1024], NFKD[1024];
+
+    /* report the file that was actually requested, or that none was given */
+    check(f != NULL, "error opening %s",
+          argc > 1 ? argv[1] : "(no test file given on the command line)");
+    while (getline(&buf, &bufsize, f) > 0) {
+        size_t offset;
+        lineno += 1;
+
+        if (buf[0] == '@') {
+            printf("line %zd: %s", lineno, buf + 1);
+            continue;
+        }
+        else if (lineno % 1000 == 0)
+            printf("checking line %zd...\n", lineno);
+
+        if (buf[0] == '#') continue;
+
+        /* each encode() call returns how far to advance to the next column */
+        offset = encode(source, buf);
+        offset += encode(NFC, buf + offset);
+        offset += encode(NFD, buf + offset);
+        offset += encode(NFKC, buf + offset);
+        offset += encode(NFKD, buf + offset);
+
+        CHECK_NORM(NFC, NFC, source);
+        CHECK_NORM(NFC, NFC, NFC);
+        CHECK_NORM(NFC, NFC, NFD);
+        CHECK_NORM(NFC, NFKC, NFKC);
+        CHECK_NORM(NFC, NFKC, NFKD);
+
+        CHECK_NORM(NFD, NFD, source);
+        CHECK_NORM(NFD, NFD, NFC);
+        CHECK_NORM(NFD, NFD, NFD);
+        CHECK_NORM(NFD, NFKD, NFKC);
+        CHECK_NORM(NFD, NFKD, NFKD);
+
+        CHECK_NORM(NFKC, NFKC, source);
+        CHECK_NORM(NFKC, NFKC, NFC);
+        CHECK_NORM(NFKC, NFKC, NFD);
+        CHECK_NORM(NFKC, NFKC, NFKC);
+        CHECK_NORM(NFKC, NFKC, NFKD);
+
+        CHECK_NORM(NFKD, NFKD, source);
+        CHECK_NORM(NFKD, NFKD, NFC);
+        CHECK_NORM(NFKD, NFKD, NFD);
+        CHECK_NORM(NFKD, NFKD, NFKC);
+        CHECK_NORM(NFKD, NFKD, NFKD);
+    }
+    free(buf); /* getline() allocated the line buffer; the caller releases it */
+    fclose(f);
+    printf("Passed tests after %zd lines!\n", lineno);
+    return 0;
+}
--- /dev/null
+/* simple test program to print out the utf8proc properties for a codepoint */
+
+#include "tests.h"
+
+/* For every command-line argument, parses it as a hexadecimal codepoint,
+ * looks up its utf8proc property record with utf8proc_get_property() and
+ * prints the record's fields.  An argument that does not parse as hex makes
+ * check() print a diagnostic and exit.  Returns 0 otherwise. */
+int main(int argc, char **argv)
+{
+    int i;
+
+    for (i = 1; i < argc; ++i) {
+        unsigned int c;   /* "%x" stores through an unsigned int pointer */
+        const utf8proc_property_t *p;
+
+        check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
+        p = utf8proc_get_property((int) c);
+        /* the three case mappings are printed with %x, which takes an
+           unsigned int, hence the explicit casts */
+        printf("U+%s:\n"
+               "  category = %d\n"
+               "  combining_class = %d\n"
+               "  bidi_class = %d\n"
+               "  decomp_type = %d\n"
+               "  uppercase_mapping = %x\n"
+               "  lowercase_mapping = %x\n"
+               "  titlecase_mapping = %x\n"
+               "  comb1st_index = %d\n"
+               "  comb2nd_index = %d\n"
+               "  bidi_mirrored = %d\n"
+               "  comp_exclusion = %d\n"
+               "  ignorable = %d\n"
+               "  control_boundary = %d\n"
+               "  boundclass = %d\n",
+               argv[i],
+               p->category,
+               p->combining_class,
+               p->bidi_class,
+               p->decomp_type,
+               (unsigned int) p->uppercase_mapping,
+               (unsigned int) p->lowercase_mapping,
+               (unsigned int) p->titlecase_mapping,
+               p->comb1st_index,
+               p->comb2nd_index,
+               p->bidi_mirrored,
+               p->comp_exclusion,
+               p->ignorable,
+               p->control_boundary,
+               p->boundclass);
+    }
+    return 0;
+}
--- /dev/null
+/* Common functions and includes for our test programs. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "../utf8proc.h"
+
+size_t lineno = 0;
+
+/* Test assertion helper.  When cond is zero, writes "line <lineno>: " followed
+ * by the printf-style message built from format and the variadic arguments,
+ * then a newline, to stderr and terminates the program with exit status 1.
+ * When cond is non-zero it does nothing. */
+void check(int cond, const char *format, ...)
+{
+    if (!cond) {
+        va_list args;
+        /* lineno is a size_t, so the matching conversion is %zu */
+        fprintf(stderr, "line %zu: ", lineno);
+        va_start(args, format);
+        vfprintf(stderr, format, args);
+        va_end(args);
+        fprintf(stderr, "\n");
+        exit(1);
+    }
+}
+
+/* Returns the index of the first character at or after buf[i] that is not
+ * whitespace according to isspace().  The terminating NUL is not whitespace,
+ * so the scan never runs past the end of a NUL-terminated string. */
+size_t skipspaces(const char *buf, size_t i)
+{
+    /* cast to unsigned char: passing a negative char value (a byte >= 0x80
+       where char is signed) to isspace() is undefined behaviour */
+    while (isspace((unsigned char) buf[i])) ++i;
+    return i;
+}
+
+/* If buf points to a sequence of codepoints encoded as hexadecimal strings,
+   separated by whitespace, and terminated by any character not in
+   [0-9a-fA-F] or whitespace, then stores the corresponding UTF-8 string
+   (NUL-terminated) in dest and returns the number of bytes read from buf.
+
+   The returned count includes the terminating character, i.e. it is the
+   index just past the terminator.  When the terminator is the string's own
+   NUL, that index lies beyond the end of the string, so callers must not
+   keep scanning from it.
+
+   dest is written without any bounds check; the caller must supply a buffer
+   large enough for the encoded output plus the terminating NUL. */
+size_t encode(char *dest, const char *buf)
+{
+    size_t i = 0, j, d = 0;
+    for (;;) {
+        unsigned int c;   /* "%x" stores through an unsigned int pointer */
+        i = skipspaces(buf, i);
+        /* find end of hex input; the unsigned char cast keeps bytes >= 0x80
+           from being passed to <ctype.h> as negative values, and
+           isxdigit() is false for the terminating NUL */
+        for (j = i; isxdigit((unsigned char) buf[j]); ++j)
+            ;
+        if (j == i) { /* no codepoint found */
+            dest[d] = 0; /* NUL-terminate destination string */
+            return i + 1; /* count the terminator as consumed */
+        }
+        check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
+        i = j; /* skip to char after hex input */
+        d += utf8proc_encode_char((int32_t) c, (uint8_t *) (dest + d));
+    }
+}
+
+++ /dev/null
-/* Common functions and includes for our test programs. */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <string.h>
-#include <stdarg.h>
-
-#include "utf8proc.h"
-
-size_t lineno = 0;
-
-void check(int cond, const char *format, ...)
-{
- if (!cond) {
- va_list args;
- fprintf(stderr, "line %zd: ", lineno);
- va_start(args, format);
- vfprintf(stderr, format, args);
- va_end(args);
- fprintf(stderr, "\n");
- exit(1);
- }
-}
-
-size_t skipspaces(const char *buf, size_t i)
-{
- while (isspace(buf[i])) ++i;
- return i;
-}
-
-/* if buf points to a sequence of codepoints encoded as hexadecimal strings,
- separated by whitespace, and terminated by any character not in
- [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
- in dest, returning the number of bytes read from buf */
-size_t encode(char *dest, const char *buf)
-{
- size_t i = 0, j, d = 0;
- do {
- int c;
- i = skipspaces(buf, i);
- for (j=i; buf[j] && strchr("0123456789abcdef", tolower(buf[j])); ++j)
- ; /* find end of hex input */
- if (j == i) { /* no codepoint found */
- dest[d] = 0; /* NUL-terminate destination string */
- return i + 1;
- }
- check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
- i = j; /* skip to char after hex input */
- d += utf8proc_encode_char(c, (uint8_t *) (dest + d));
- } while (1);
-}
-